library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(naniar)
BC_data <- read_tsv("GSE96058_expression_matrix_3decimals.tsv")
## Rows: 30865 Columns: 3410
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Unnamed: 0
## dbl (3409): F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13, F14, F15...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
textfile <- read_tsv("GSE96058_metadata.txt")
## Rows: 3069 Columns: 31
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (12): sample_id, title, geo_accession, type, source_name_ch1, instrument...
## dbl (19): channel_count, age at diagnosis:ch1, chemo treated:ch1, endocrine ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(textfile)
## sample_id title geo_accession type
## Length:3069 Length:3069 Length:3069 Length:3069
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## channel_count source_name_ch1 age at diagnosis:ch1 chemo treated:ch1
## Min. :1 Length:3069 Min. :24.00 Min. :0.0000
## 1st Qu.:1 Class :character 1st Qu.:53.00 1st Qu.:0.0000
## Median :1 Mode :character Median :64.00 Median :0.0000
## Mean :1 Mean :62.76 Mean :0.4072
## 3rd Qu.:1 3rd Qu.:71.00 3rd Qu.:1.0000
## Max. :1 Max. :96.00 Max. :1.0000
## NA's :21
## endocrine treated:ch1 er prediction mgc:ch1 er prediction sgc:ch1
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.7765 Mean :0.7983 Mean :0.8866
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :22
## er status:ch1 her2 prediction mgc:ch1 her2 prediction sgc:ch1
## Min. :0.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :1.000 Median :0.0000 Median :0.00000
## Mean :0.922 Mean :0.1297 Mean :0.09286
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.0000 Max. :1.00000
## NA's :199
## her2 status:ch1 instrument model:ch1 ki67 prediction mgc:ch1
## Min. :0.0000 Length:3069 Min. :0.0000
## 1st Qu.:0.0000 Class :character 1st Qu.:0.0000
## Median :0.0000 Mode :character Median :0.0000
## Mean :0.1323 Mean :0.3001
## 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## NA's :105
## ki67 prediction sgc:ch1 ki67 status:ch1 lymph node group:ch1
## Min. :0.0000 Min. :0.0000 Length:3069
## 1st Qu.:0.0000 1st Qu.:0.0000 Class :character
## Median :1.0000 Median :1.0000 Mode :character
## Mean :0.5888 Mean :0.5862
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## NA's :1682
## lymph node status:ch1 nhg prediction mgc:ch1 nhg:ch1
## Length:3069 Length:3069 Length:3069
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## overall survival days:ch1 overall survival event:ch1 pam50 subtype:ch1
## Min. : 56 Min. :0.0000 Length:3069
## 1st Qu.:1236 1st Qu.:0.0000 Class :character
## Median :1612 Median :0.0000 Mode :character
## Mean :1606 Mean :0.1049
## 3rd Qu.:2004 3rd Qu.:0.0000
## Max. :2474 Max. :1.0000
##
## pgr prediction mgc:ch1 pgr prediction sgc:ch1 pgr status:ch1
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.7739 Mean :0.7758 Mean :0.8678
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :330
## scan-b external id:ch1 tumor size:ch1
## Length:3069 Min. : 0.00
## Class :character 1st Qu.: 12.00
## Mode :character Median : 17.00
## Mean : 19.98
## 3rd Qu.: 24.00
## Max. :126.00
## NA's :32
BC_data_new <- column_to_rownames(BC_data, var = "Unnamed: 0")
BC_data_new2 <- t(BC_data_new)
BC_data_new2 <- as.data.frame(t(BC_data_new))
BC_data_new2 <- rownames_to_column(BC_data_new2, var = "title")
merged_data_2 <- left_join(textfile, BC_data_new2, by = "title")
reduced_merged_data <- merged_data_2 %>%
select(`overall survival days:ch1`, `overall survival event:ch1`, `ABHD11`, `ESR1`, `chemo treated:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `er status:ch1`, `her2 status:ch1`, `pgr status:ch1`, `endocrine treated:ch1`)
reduced_merged_data$`chemo treated:ch1`<- as.factor(reduced_merged_data$`chemo treated:ch1`)
reduced_merged_data$`overall survival event:ch1`<- as.factor(reduced_merged_data$`overall survival event:ch1`)
reduced_merged_data$`er status:ch1`<- as.factor(reduced_merged_data$`er status:ch1`)
reduced_merged_data$`her2 status:ch1`<- as.factor(reduced_merged_data$`her2 status:ch1`)
reduced_merged_data$`pgr status:ch1`<- as.factor(reduced_merged_data$`pgr status:ch1`)
reduced_merged_data$`endocrine treated:ch1` <- as.factor(reduced_merged_data$`endocrine treated:ch1`)
print(reduced_merged_data)
## # A tibble: 3,069 × 11
## `overall survival days:ch1` `overall survival event:ch1` ABHD11 ESR1
## <dbl> <fct> <dbl> <dbl>
## 1 2367 0 4.13 0.962
## 2 2367 0 4.34 6.17
## 3 2168 1 5.27 7.98
## 4 2416 0 5.87 6.05
## 5 2389 0 3.32 5.50
## 6 2373 0 4.51 0.263
## 7 2380 0 6.47 1.14
## 8 2324 0 5.66 5.97
## 9 2367 0 4.50 6.54
## 10 1962 1 5.86 7.57
## # ℹ 3,059 more rows
## # ℹ 7 more variables: `chemo treated:ch1` <fct>, `age at diagnosis:ch1` <dbl>,
## # `tumor size:ch1` <dbl>, `er status:ch1` <fct>, `her2 status:ch1` <fct>,
## # `pgr status:ch1` <fct>, `endocrine treated:ch1` <fct>
metadata_missing <- reduced_merged_data %>%
select(where(~ any(is.na(.))))
vis_miss(reduced_merged_data, sort_miss = TRUE) +
theme_minimal(base_size = 12) +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.title.x = element_blank()
) +
labs(title = "Missing Data in Metadata")
location_missing_data_chemo <- textfile %>%
filter(is.na(`chemo treated:ch1`)) %>%
select(sample_id, `er status:ch1`, `pgr status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `tumor size:ch1`, `age at diagnosis:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`)
location_missing_data_tumor_size <- textfile %>%
filter(is.na(`tumor size:ch1`)) %>%
select(sample_id, `er status:ch1`, `pgr status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)
location_missing_data_tumor_size <- textfile %>%
filter(is.na(`tumor size:ch1`)) %>%
select(sample_id, `er status:ch1`, `pgr status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)
location_missing_data_her2 <- textfile %>%
filter(is.na(`her2 status:ch1`)) %>%
select(sample_id, `er status:ch1`, `pgr status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)
location_missing_data_er <- textfile %>%
filter(is.na(`er status:ch1`)) %>%
select(sample_id, `pgr status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)
location_missing_data_pgr <- textfile %>%
filter(is.na(`pgr status:ch1`)) %>%
select(sample_id, `er status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)
location_missing_data_endo <- textfile %>%
filter(is.na(`endocrine treated:ch1`)) %>%
select(sample_id, `er status:ch1`, `her2 status:ch1`, `ki67 status:ch1`, `nhg:ch1`, `lymph node group:ch1`, `lymph node status:ch1`, `age at diagnosis:ch1`, `tumor size:ch1`, `overall survival event:ch1`, `endocrine treated:ch1`, `chemo treated:ch1`)
print(location_missing_data_chemo)
## # A tibble: 21 × 12
## sample_id `er status:ch1` `pgr status:ch1` `her2 status:ch1`
## <chr> <dbl> <dbl> <dbl>
## 1 GSM2528173 1 1 0
## 2 GSM2528236 NA NA 0
## 3 GSM2528339 1 1 0
## 4 GSM2528393 1 1 0
## 5 GSM2528463 0 0 0
## 6 GSM2528524 NA NA 0
## 7 GSM2528605 1 1 NA
## 8 GSM2528973 0 0 0
## 9 GSM2529367 1 1 0
## 10 GSM2529380 0 0 1
## # ℹ 11 more rows
## # ℹ 8 more variables: `ki67 status:ch1` <dbl>, `nhg:ch1` <chr>,
## # `lymph node group:ch1` <chr>, `lymph node status:ch1` <chr>,
## # `tumor size:ch1` <dbl>, `age at diagnosis:ch1` <dbl>,
## # `overall survival event:ch1` <dbl>, `endocrine treated:ch1` <dbl>
print(location_missing_data_tumor_size)
## # A tibble: 32 × 12
## sample_id `er status:ch1` `pgr status:ch1` `her2 status:ch1`
## <chr> <dbl> <dbl> <dbl>
## 1 GSM2528105 NA NA 1
## 2 GSM2528317 0 0 1
## 3 GSM2528319 1 1 1
## 4 GSM2528520 1 0 0
## 5 GSM2528579 1 0 0
## 6 GSM2528584 NA NA 0
## 7 GSM2528746 NA NA 0
## 8 GSM2528901 1 1 0
## 9 GSM2528950 1 1 0
## 10 GSM2528953 1 1 0
## # ℹ 22 more rows
## # ℹ 8 more variables: `ki67 status:ch1` <dbl>, `nhg:ch1` <chr>,
## # `lymph node group:ch1` <chr>, `lymph node status:ch1` <chr>,
## # `age at diagnosis:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## # `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
print(location_missing_data_her2)
## # A tibble: 105 × 12
## sample_id `er status:ch1` `pgr status:ch1` `ki67 status:ch1` `nhg:ch1`
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 GSM2528129 1 1 NA G3
## 2 GSM2528213 1 1 NA G1
## 3 GSM2528233 1 1 NA G1
## 4 GSM2528322 1 1 NA G2
## 5 GSM2528514 0 0 1 <NA>
## 6 GSM2528521 1 1 NA <NA>
## 7 GSM2528605 1 1 1 G3
## 8 GSM2528616 1 1 0 G3
## 9 GSM2528617 1 1 0 G2
## 10 GSM2528620 1 0 1 G2
## # ℹ 95 more rows
## # ℹ 7 more variables: `lymph node group:ch1` <chr>,
## # `lymph node status:ch1` <chr>, `age at diagnosis:ch1` <dbl>,
## # `tumor size:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## # `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
print(location_missing_data_er)
## # A tibble: 199 × 12
## sample_id `pgr status:ch1` `her2 status:ch1` `ki67 status:ch1` `nhg:ch1`
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 GSM2528079 NA 0 NA G3
## 2 GSM2528084 NA 0 NA G3
## 3 GSM2528093 NA 0 NA G3
## 4 GSM2528094 NA 0 NA G3
## 5 GSM2528105 NA 1 NA G3
## 6 GSM2528106 NA 1 NA G3
## 7 GSM2528107 NA 1 NA G3
## 8 GSM2528121 NA 1 NA G3
## 9 GSM2528126 NA 0 NA G3
## 10 GSM2528127 NA 0 NA G3
## # ℹ 189 more rows
## # ℹ 7 more variables: `lymph node group:ch1` <chr>,
## # `lymph node status:ch1` <chr>, `age at diagnosis:ch1` <dbl>,
## # `tumor size:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## # `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
print(location_missing_data_pgr)
## # A tibble: 330 × 12
## sample_id `er status:ch1` `her2 status:ch1` `ki67 status:ch1` `nhg:ch1`
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 GSM2528079 NA 0 NA G3
## 2 GSM2528082 1 1 NA G3
## 3 GSM2528083 1 0 NA G2
## 4 GSM2528084 NA 0 NA G3
## 5 GSM2528087 1 1 NA G1
## 6 GSM2528088 1 1 NA G3
## 7 GSM2528093 NA 0 NA G3
## 8 GSM2528094 NA 0 NA G3
## 9 GSM2528101 1 0 NA G2
## 10 GSM2528105 NA 1 NA G3
## # ℹ 320 more rows
## # ℹ 7 more variables: `lymph node group:ch1` <chr>,
## # `lymph node status:ch1` <chr>, `age at diagnosis:ch1` <dbl>,
## # `tumor size:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## # `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
print(location_missing_data_endo)
## # A tibble: 22 × 12
## sample_id `er status:ch1` `her2 status:ch1` `ki67 status:ch1` `nhg:ch1`
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 GSM2528173 1 0 0 G1
## 2 GSM2528236 NA 0 NA G3
## 3 GSM2528339 1 0 0 G1
## 4 GSM2528393 1 0 0 G1
## 5 GSM2528463 0 0 1 G3
## 6 GSM2528524 NA 0 1 G3
## 7 GSM2528605 1 NA 1 G3
## 8 GSM2528973 0 0 1 G2
## 9 GSM2529367 1 0 0 <NA>
## 10 GSM2529380 0 1 1 G3
## # ℹ 12 more rows
## # ℹ 7 more variables: `lymph node group:ch1` <chr>,
## # `lymph node status:ch1` <chr>, `age at diagnosis:ch1` <dbl>,
## # `tumor size:ch1` <dbl>, `overall survival event:ch1` <dbl>,
## # `endocrine treated:ch1` <dbl>, `chemo treated:ch1` <dbl>
mode_level_her2 <- names(which.max(table(reduced_merged_data$`her2 status:ch1`)))
mode_level_tumor_size <- names(which.max(table(reduced_merged_data$`tumor size:ch1`)))
mode_level_chemo <- names(which.max(table(reduced_merged_data$`chemo treated:ch1`)))
mode_level_er <- names(which.max(table(reduced_merged_data$`er status:ch1`)))
mode_level_pgr <- names(which.max(table(reduced_merged_data$`pgr status:ch1`)))
mode_level_endo <- names(which.max(table(reduced_merged_data$`endocrine treated:ch1`)))
no_NA_reduced_merged_data <- reduced_merged_data %>%
mutate(
`chemo treated:ch1` = replace(`chemo treated:ch1`,
is.na(`chemo treated:ch1`),
mode_level_chemo),
`er status:ch1` = replace(`er status:ch1`,
is.na(`er status:ch1`),
mode_level_er),
`pgr status:ch1` = replace(`pgr status:ch1`,
is.na(`pgr status:ch1`),
mode_level_pgr),
`endocrine treated:ch1` = replace(`endocrine treated:ch1`,
is.na(`endocrine treated:ch1`),
mode_level_endo)
)
no_NA_reduced_merged_data <- no_NA_reduced_merged_data %>%
drop_na(`her2 status:ch1`) %>%
drop_na(`tumor size:ch1`)
print(no_NA_reduced_merged_data)
## # A tibble: 2,938 × 11
## `overall survival days:ch1` `overall survival event:ch1` ABHD11 ESR1
## <dbl> <fct> <dbl> <dbl>
## 1 2367 0 4.13 0.962
## 2 2367 0 4.34 6.17
## 3 2168 1 5.27 7.98
## 4 2416 0 5.87 6.05
## 5 2389 0 3.32 5.50
## 6 2373 0 4.51 0.263
## 7 2380 0 6.47 1.14
## 8 2324 0 5.66 5.97
## 9 2367 0 4.50 6.54
## 10 1962 1 5.86 7.57
## # ℹ 2,928 more rows
## # ℹ 7 more variables: `chemo treated:ch1` <fct>, `age at diagnosis:ch1` <dbl>,
## # `tumor size:ch1` <dbl>, `er status:ch1` <fct>, `her2 status:ch1` <fct>,
## # `pgr status:ch1` <fct>, `endocrine treated:ch1` <fct>
library(dplyr)
Q1 <- quantile(no_NA_reduced_merged_data$ABHD11, 0.25, na.rm = TRUE)
Q3 <- quantile(no_NA_reduced_merged_data$ABHD11, 0.75, na.rm = TRUE)
IQR_value <- IQR(no_NA_reduced_merged_data$ABHD11, na.rm = TRUE)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- no_NA_reduced_merged_data %>%
filter(ABHD11 < lower_bound | ABHD11 > upper_bound)
num_outliers <- nrow(outliers)
print(paste("Number of outliers:", num_outliers))
## [1] "Number of outliers: 54"
print(num_outliers)
## [1] 54
ggplot_outlier_detection <- ggplot(no_NA_reduced_merged_data, aes(y = ABHD11, x = 1)) +
geom_jitter(
width = 0.2,
alpha = 0.3,
color = "grey50"
) +
geom_boxplot(
width = 0.3,
outlier.color = "black",
outlier.size = 1.5,
fill = NA,
color = "black"
) +
labs(
title = "ABHD11 expression",
x = "",
y = "ABHD11 expression"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.title = element_text(face = "bold")
)
print(ggplot_outlier_detection)
library(ggplot2)
Histogram_ABHD11 <- ggplot(no_NA_reduced_merged_data, aes(x = ABHD11)) +
geom_histogram(
bins = 60,
fill = "steelblue",
color = "black",
alpha = 0.7
) +
labs(
title = "Histogram of ABHD11",
x = "ABHD11 expression",
y = "Frequency"
) +
theme_minimal(base_size = 14)
print(Histogram_ABHD11)
Scatterplot_ABHD11 <- ggplot(no_NA_reduced_merged_data,
aes(x = ABHD11, y = `tumor size:ch1`)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = TRUE, color = "blue") +
labs(
x = "ABHD11 expression (log-transformed)",
y = "Tumor size (cm)",
title = "Scatterplot of ABHD11 Expression vs Tumor Size"
) +
theme_minimal()
print(Scatterplot_ABHD11)
## `geom_smooth()` using formula = 'y ~ x'
no_NA_reduced_merged_data$`tumor size:ch1` <- as.numeric(no_NA_reduced_merged_data$`tumor size:ch1`)
# Stap 1: filter for small tumors (<2 cm)
small_tumors <- no_NA_reduced_merged_data[no_NA_reduced_merged_data$`tumor size:ch1` < 2, ]
spearman_test <- cor.test(
small_tumors$ABHD11,
small_tumors$`tumor size:ch1`,
method = "spearman"
)
## Warning in cor.test.default(small_tumors$ABHD11, small_tumors$`tumor size:ch1`,
## : Cannot compute exact p-value with ties
print(spearman_test)
##
## Spearman's rank correlation rho
##
## data: small_tumors$ABHD11 and small_tumors$`tumor size:ch1`
## S = 530.72, p-value = 0.5696
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1664101
Scatterplot_loess <- ggplot(small_tumors, aes(x = ABHD11, y = `tumor size:ch1`)) +
geom_point() +
geom_smooth(method = "loess", color = "blue") +
labs(
x = "ABHD11 expression (log-transformed)",
y = "Tumor size (cm)",
title = "Scatterplot: ABHD11 vs Tumor Size (<2 cm)"
) +
theme_minimal()
print(Scatterplot_loess)
## `geom_smooth()` using formula = 'y ~ x'
spearman_test <- cor.test(
no_NA_reduced_merged_data$ABHD11,
no_NA_reduced_merged_data$`tumor size:ch1`,
method = "spearman"
)
## Warning in cor.test.default(no_NA_reduced_merged_data$ABHD11,
## no_NA_reduced_merged_data$`tumor size:ch1`, : Cannot compute exact p-value with
## ties
print(spearman_test)
##
## Spearman's rank correlation rho
##
## data: no_NA_reduced_merged_data$ABHD11 and no_NA_reduced_merged_data$`tumor size:ch1`
## S = 3725756477, p-value = 1.161e-10
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.1185242
library(ggplot2)
Scatterplot_3 <- ggplot(no_NA_reduced_merged_data, aes(x = ABHD11, y = `tumor size:ch1`)) +
geom_point() +
geom_smooth(method = "loess", color = "blue") +
labs(
x = "ABHD11 expression (log-transformed)",
y = "Tumor size (cm)",
title = "Scatterplot: ABHD11 vs Tumor Size"
) +
theme_minimal()
print(Scatterplot_3)
## `geom_smooth()` using formula = 'y ~ x'
library(matrixStats)
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
library(Rtsne)
set.seed(42)
BC_data_asdata <- as.data.frame(BC_data_new)
BC_data_asdata[] <- lapply(BC_data_asdata, function(x) as.numeric(trimws(x)))
BC_data_matrix <- as.matrix(BC_data_asdata)
gene_var <- rowVars(BC_data_matrix) # numeric vector of variances
names(gene_var) <- rownames(BC_data_matrix) # label variance values with gene names
n_top <- min(1000, nrow(BC_data_matrix)) # if fewer than 1000 genes, adjust automatically
top_genes <- names(sort(gene_var, decreasing = TRUE))[1:n_top]
expr_top <- BC_data_matrix[top_genes, ] # still genes x samples
expr_top_t <- t(expr_top)
expr_scaled <- scale(expr_top_t)
tsne_res <- Rtsne(
expr_scaled,
dims = 2,
perplexity = 30, # adjust depending on sample size
verbose = TRUE,
pca = FALSE
)
## Read the 3409 x 1000 data matrix successfully!
## Using no_dims = 2, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 22.49 seconds (sparsity = 0.047856)!
## Learning embedding...
## Iteration 50: error is 84.638146 (50 iterations in 1.05 seconds)
## Iteration 100: error is 83.534885 (50 iterations in 0.97 seconds)
## Iteration 150: error is 84.504285 (50 iterations in 0.87 seconds)
## Iteration 200: error is 82.771476 (50 iterations in 1.13 seconds)
## Iteration 250: error is 82.853195 (50 iterations in 1.21 seconds)
## Iteration 300: error is 2.964818 (50 iterations in 0.80 seconds)
## Iteration 350: error is 2.816146 (50 iterations in 0.63 seconds)
## Iteration 400: error is 2.763845 (50 iterations in 0.63 seconds)
## Iteration 450: error is 2.738270 (50 iterations in 0.63 seconds)
## Iteration 500: error is 2.725157 (50 iterations in 0.63 seconds)
## Iteration 550: error is 2.718625 (50 iterations in 0.63 seconds)
## Iteration 600: error is 2.714560 (50 iterations in 0.63 seconds)
## Iteration 650: error is 2.711677 (50 iterations in 0.63 seconds)
## Iteration 700: error is 2.708887 (50 iterations in 0.63 seconds)
## Iteration 750: error is 2.706729 (50 iterations in 0.64 seconds)
## Iteration 800: error is 2.705620 (50 iterations in 0.63 seconds)
## Iteration 850: error is 2.704790 (50 iterations in 0.63 seconds)
## Iteration 900: error is 2.703746 (50 iterations in 0.63 seconds)
## Iteration 950: error is 2.702862 (50 iterations in 0.64 seconds)
## Iteration 1000: error is 2.701858 (50 iterations in 0.63 seconds)
## Fitting performed in 14.89 seconds.
summary(tsne_res)
## Length Class Mode
## N 1 -none- numeric
## Y 6818 -none- numeric
## costs 3409 -none- numeric
## itercosts 20 -none- numeric
## origD 1 -none- numeric
## perplexity 1 -none- numeric
## theta 1 -none- numeric
## max_iter 1 -none- numeric
## stop_lying_iter 1 -none- numeric
## mom_switch_iter 1 -none- numeric
## momentum 1 -none- numeric
## final_momentum 1 -none- numeric
## eta 1 -none- numeric
## exaggeration_factor 1 -none- numeric
tsne_df <- data.frame(
Sample = rownames(expr_scaled),
TSNE1 = tsne_res$Y[,1],
TSNE2 = tsne_res$Y[,2],
stringsAsFactors = FALSE
)
head(tsne_df)
Scatter_tSNE <- ggplot(tsne_df, aes(x = TSNE1, y = TSNE2)) +
geom_point(color = "steelblue", size = 2, alpha = 0.7) + # punten
theme_minimal() + # nette achtergrond
labs(
title = "t-SNE 2-Dimensional Digit Visualization",
x = "t-SNE Dimension 1",
y = "t-SNE Dimension 2"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 16),
axis.title = element_text(size = 14)
)
print(Scatter_tSNE)
tsne_merged <- merge(
tsne_df,
merged_data_2,
by.x = "Sample",
by.y = "title",
all.x = TRUE
)
Plot_chemo <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`chemo treated:ch1`))) +
geom_point(size = 2, alpha = 0.8) +
scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
theme_minimal() +
labs(title = "t-SNE colored by Chemo Treatment",
x = "t-SNE 1", y = "t-SNE 2")
print(Plot_chemo)
plot_surv <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`overall survival event:ch1`))) +
geom_point(size = 2, alpha = 0.8) +
scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
theme_minimal() +
labs(title = "t-SNE colored by overall survival event",
x = "t-SNE 1", y = "t-SNE 2")
print(plot_surv)
plot_er <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`er status:ch1`))) +
geom_point(size = 2, alpha = 0.8) +
scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
theme_minimal() +
labs(title = "t-SNE colored by ER status",
x = "t-SNE 1", y = "t-SNE 2")
print(plot_er)
plot_her2 <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`her2 status:ch1`))) +
geom_point(size = 2, alpha = 0.8) +
scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
theme_minimal() +
labs(title = "t-SNE colored by HER2 status",
x = "t-SNE 1", y = "t-SNE 2")
print(plot_her2)
plot_pgr <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = factor(`pgr status:ch1`))) +
geom_point(size = 2, alpha = 0.8) +
scale_color_manual(values = c("steelblue", "firebrick"), labels = c("No", "Yes")) +
theme_minimal() +
labs(title = "t-SNE colored by PGR status",
x = "t-SNE 1", y = "t-SNE 2")
print(plot_pgr)
plot_aad <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = `age at diagnosis:ch1`)) +
geom_point(size = 2, alpha = 0.8) +
scale_color_viridis_c(option = "plasma") +
theme_minimal() +
labs(title = "t-SNE colored by Age at Diagnosis",
x = "t-SNE 1", y = "t-SNE 2")
print(plot_aad)
plot_osd <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = `overall survival days:ch1`)) +
geom_point(size = 2, alpha = 0.8) +
scale_color_viridis_c(option = "plasma") +
theme_minimal() +
labs(title = "t-SNE colored by overall survival days",
x = "t-SNE 1", y = "t-SNE 2")
print(plot_osd)
plot_tumor <- ggplot(tsne_merged, aes(x = TSNE1, y = TSNE2, color = `tumor size:ch1`)) +
geom_point(size = 2, alpha = 0.8) +
scale_color_viridis_c(option = "plasma") +
theme_minimal() +
labs(title = "t-SNE colored by tumor size",
x = "t-SNE 1", y = "t-SNE 2")
print(plot_tumor)
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.26.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.16
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
# Step 1: Calculate variance
gene_var <- apply(BC_data_new, 1, var)
gene_var_filtered <- gene_var[gene_var > 0.01]
top_genes <- names(sort(gene_var_filtered, decreasing = TRUE))[1:500]
# Step 2: Prepare matrix
BC_data_heatmap <- BC_data_new[top_genes, ]
BC_data_heatmap <- as.matrix(BC_data_heatmap)
ht <- Heatmap(
BC_data_heatmap,
name = "Expression",
show_row_names = FALSE,
show_column_names = FALSE,
clustering_method_rows = "complete",
clustering_method_columns = "complete",
column_title = "Heatmap of top 500 most variable genes",
row_title = "Genes"
)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
draw(ht)
library(ComplexHeatmap)
library(circlize)
BC_data_heatmap_2 <- t(scale(t(BC_data_heatmap))) # optional scaling
# Step 3: Save to file
ht_2 <- Heatmap(
BC_data_heatmap_2,
name = "Expression",
show_row_names = FALSE,
show_column_names = FALSE,
clustering_method_rows = "complete",
clustering_method_columns = "complete",
column_title = "Heatmap of top 500 most variable genes",
row_title = "Genes"
)
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
draw(ht_2)
data_out <- merged_data_2
vars <- data_out[, c("ESR1",
"chemo treated:ch1",
"overall survival days:ch1",
"age at diagnosis:ch1",
"tumor size:ch1",
"er status:ch1",
"her2 status:ch1",
"pgr status:ch1")]
vars_num <- data.frame(lapply(vars, function(x) as.numeric(as.character(x))))
complete_idx <- complete.cases(vars_num)
vars_complete <- vars_num[complete_idx, ]
cov_matrix <- cov(vars_complete)
center <- colMeans(vars_complete)
mahal_dist <- mahalanobis(vars_complete, center, cov_matrix)
threshold <- qchisq(0.999, df = ncol(vars_complete))
outliers_local <- which(mahal_dist > threshold)
outliers <- which(complete_idx)[outliers_local]
print(outliers)
## [1] 92 238 300 398 414 438 445 454 466 478 514 529 553 578 599
## [16] 640 641 737 740 784 786 967 1116 1164 1395 1436 1445 1468 1477 1485
## [31] 1504 1554 1558 1571 1614 1632 1644 1657 1707 1764 1817 1820 1836 1863 1869
## [46] 1892 1897 1899 1949 1964 1992 1994 2004 2011 2020 2022 2040 2100 2111 2123
## [61] 2151 2174 2207 2225 2252 2260 2273 2343 2493 2678 2687 2739 2773 2830 2833
## [76] 2841 2842 2845 2883 2885 2906 2914 2927 2938 2978 3012 3029 3041 3042 3067
no_NA_reduced_merged_data$`chemo treated:ch1`<- as.factor(no_NA_reduced_merged_data$`chemo treated:ch1`)
no_NA_reduced_merged_data$`overall survival event:ch1` <- as.numeric(no_NA_reduced_merged_data$`overall survival event:ch1`)
no_NA_reduced_merged_data$`overall survival days:ch1` <- as.numeric(no_NA_reduced_merged_data$`overall survival days:ch1`)
library(survival)
cox_model <- coxph(Surv(`overall survival days:ch1`, `overall survival event:ch1`) ~
ESR1 * `chemo treated:ch1` + `age at diagnosis:ch1` + `tumor size:ch1` + `er status:ch1` + `her2 status:ch1` + `pgr status:ch1`,
data = no_NA_reduced_merged_data)
summary(cox_model)
## Call:
## coxph(formula = Surv(`overall survival days:ch1`, `overall survival event:ch1`) ~
## ESR1 * `chemo treated:ch1` + `age at diagnosis:ch1` + `tumor size:ch1` +
## `er status:ch1` + `her2 status:ch1` + `pgr status:ch1`,
## data = no_NA_reduced_merged_data)
##
## n= 2938, number of events= 318
##
## coef exp(coef) se(coef) z Pr(>|z|)
## ESR1 -0.117756 0.888912 0.028908 -4.073 4.63e-05 ***
## `chemo treated:ch1`1 0.400522 1.492604 0.249252 1.607 0.1081
## `age at diagnosis:ch1` 0.067769 1.070118 0.005990 11.313 < 2e-16 ***
## `tumor size:ch1` 0.019055 1.019238 0.002783 6.846 7.60e-12 ***
## `er status:ch1`1 0.340241 1.405287 0.266274 1.278 0.2013
## `her2 status:ch1`1 0.159112 1.172469 0.170176 0.935 0.3498
## `pgr status:ch1`1 -0.277404 0.757749 0.203278 -1.365 0.1724
## ESR1:`chemo treated:ch1`1 -0.110152 0.895698 0.047096 -2.339 0.0193 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## ESR1 0.8889 1.1250 0.8399 0.9407
## `chemo treated:ch1`1 1.4926 0.6700 0.9158 2.4328
## `age at diagnosis:ch1` 1.0701 0.9345 1.0576 1.0828
## `tumor size:ch1` 1.0192 0.9811 1.0137 1.0248
## `er status:ch1`1 1.4053 0.7116 0.8339 2.3682
## `her2 status:ch1`1 1.1725 0.8529 0.8399 1.6367
## `pgr status:ch1`1 0.7577 1.3197 0.5087 1.1286
## ESR1:`chemo treated:ch1`1 0.8957 1.1164 0.8167 0.9823
##
## Concordance= 0.771 (se = 0.014 )
## Likelihood ratio test= 319 on 8 df, p=<2e-16
## Wald test = 324.6 on 8 df, p=<2e-16
## Score (logrank) test = 384.2 on 8 df, p=<2e-16
library(survminer)
## Loading required package: ggpubr
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
survival_plot <- ggsurvplot(survfit(cox_model), data = no_NA_reduced_merged_data, risk.table = TRUE)
## Warning in survfit.coxph(cox_model): the model contains interactions; the
## default curve based on columm means of the X matrix is almost certainly not
## useful. Consider adding a newdata argument.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggpubr package.
## Please report the issue at <https://github.com/kassambara/ggpubr/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(survival_plot)
## Ignoring unknown labels:
## • colour : "Strata"
summary(cox_model)
## Call:
## coxph(formula = Surv(`overall survival days:ch1`, `overall survival event:ch1`) ~
## ESR1 * `chemo treated:ch1` + `age at diagnosis:ch1` + `tumor size:ch1` +
## `er status:ch1` + `her2 status:ch1` + `pgr status:ch1`,
## data = no_NA_reduced_merged_data)
##
## n= 2938, number of events= 318
##
## coef exp(coef) se(coef) z Pr(>|z|)
## ESR1 -0.117756 0.888912 0.028908 -4.073 4.63e-05 ***
## `chemo treated:ch1`1 0.400522 1.492604 0.249252 1.607 0.1081
## `age at diagnosis:ch1` 0.067769 1.070118 0.005990 11.313 < 2e-16 ***
## `tumor size:ch1` 0.019055 1.019238 0.002783 6.846 7.60e-12 ***
## `er status:ch1`1 0.340241 1.405287 0.266274 1.278 0.2013
## `her2 status:ch1`1 0.159112 1.172469 0.170176 0.935 0.3498
## `pgr status:ch1`1 -0.277404 0.757749 0.203278 -1.365 0.1724
## ESR1:`chemo treated:ch1`1 -0.110152 0.895698 0.047096 -2.339 0.0193 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## ESR1 0.8889 1.1250 0.8399 0.9407
## `chemo treated:ch1`1 1.4926 0.6700 0.9158 2.4328
## `age at diagnosis:ch1` 1.0701 0.9345 1.0576 1.0828
## `tumor size:ch1` 1.0192 0.9811 1.0137 1.0248
## `er status:ch1`1 1.4053 0.7116 0.8339 2.3682
## `her2 status:ch1`1 1.1725 0.8529 0.8399 1.6367
## `pgr status:ch1`1 0.7577 1.3197 0.5087 1.1286
## ESR1:`chemo treated:ch1`1 0.8957 1.1164 0.8167 0.9823
##
## Concordance= 0.771 (se = 0.014 )
## Likelihood ratio test= 319 on 8 df, p=<2e-16
## Wald test = 324.6 on 8 df, p=<2e-16
## Score (logrank) test = 384.2 on 8 df, p=<2e-16
library(survival)
library(survminer)
# Create ESR1 groups
no_NA_reduced_merged_data$ESR1_group <- ifelse(no_NA_reduced_merged_data$ESR1 > 3, "High ESR1", "Low ESR1")
# Combine ESR1 and chemo into strata
no_NA_reduced_merged_data$chemo <- as.factor(no_NA_reduced_merged_data$`chemo treated:ch1`)
no_NA_reduced_merged_data$strata <- interaction(no_NA_reduced_merged_data$ESR1_group, no_NA_reduced_merged_data$`chemo`)
levels(no_NA_reduced_merged_data$strata) <- c("Low ESR1 + No Chemo", "Low ESR1 + Chemo", "High ESR1 + No Chemo", "High ESR1 + Chemo")
fit <- survfit(Surv(`overall survival days:ch1`, `overall survival event:ch1`) ~ strata, data = no_NA_reduced_merged_data)
library(grid)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
surv_plot_3 <- ggsurvplot(
fit,
data = no_NA_reduced_merged_data,
risk.table = TRUE,
risk.table.height = 0.3, # Increase space for the table
risk.table.fontsize = 3.5, # Reduce font size
risk.table.y.text.col = TRUE,
risk.table.y.text = FALSE,
break.time.by = 500, # Controls x-axis intervals
xlab = "Time (days)",
ylab = "Survival Probability",
title = "Survival by ESR1 Expression and Chemotherapy",
legend.title = "ESR1 × Chemo",
legend.labs = levels(no_NA_reduced_merged_data$strata),
palette = c("#D55E00", "#E69F00", "#56B4E9", "#009E73"),
size = 1.2,
linetype = "strata",
conf.int = TRUE
)
print(surv_plot_3)
## Ignoring unknown labels:
## • colour : "ESR1 × Chemo"